rm(list=ls())
library(plyr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(ROCR)
library(ggmap)
library(rpart)
library(curl)
library(jsonlite)
library(broom)
library(rgdal)
library(scales)
library(rpart.plot)
library(RColorBrewer)
library(randomForest)
library(ggRandomForests)
Graph Injuries
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#Filter out intersections w/o injuries
d.in <- d.in %>%
filter(MN != 0)
#All Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=Injuries_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Monthly Injuries (2016)")
plot(g)
#Pedestrian Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=PedInjurie_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Pedestrian Injuries (2016)")
plot(g)
#Bike Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=BikeInjuri_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Bike Injuries (2016)")
plot(g)
#Motor Vehicle Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=MVOInjurie_COUNT)) +
geom_bar(stat="identity", fill="steelblue") +
labs(x = "Month", y = "Injuries") +
ggtitle("Motor Vehicle Injuries (2016)")
plot(g)
Plot injuries to map
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
# Set a base map
nyc <- get_map(location = c(lon = -73.935, lat = 40.712), zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.712,-73.935&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
#Bike injuries by month
bike_inj <- d.in %>% filter(Injuries == 1 & BikeInjuri == 1)
g <- ggmap(nyc) + geom_point(data = bike_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Bike Injuries (2016)")
plot(g)
## Warning: Removed 51 rows containing missing values (geom_point).
#bike_zones <- readOGR("http://www.nyc.gov/html/dot/downloads/misc/bike_priority_districts.json")
#g <- ggmap(nyc) + ggmap(bike_zones)
#multiplot(g, )
#Pedestrian injuries by month
ped_inj <- d.in %>% filter(Injuries == 1 & PedInjurie == 1)
g <- ggmap(nyc) + geom_point(data = ped_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Pedestrian Injuries (2016)")
plot(g)
## Warning: Removed 274 rows containing missing values (geom_point).
#MVO injuries by month
mvo_inj <- d.in %>% filter(Injuries == 1 & MVOInjurie == 1)
g <- ggmap(nyc) + geom_point(data = mvo_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Motor Vehicle Injuries (2016)")
plot(g)
## Warning: Removed 908 rows containing missing values (geom_point).
Format variables
#format dataset - turn variables into factors
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#transform variables into factors
d.in <- d.in %>%
mutate(Injuries = as.factor(Injuries),
PedInjurie = as.factor(PedInjurie),
BikeInjuri = as.factor(BikeInjuri),
MVOInjurie = as.factor(MVOInjurie),
bike_priority_districts = as.factor(bike_priority_districts),
enhanced_crossings = as.factor(enhanced_crossings),
left_turn_traffic_calming = as.factor(left_turn_traffic_calming),
neighborhood_slow_zones = as.factor(neighborhood_slow_zones),
leading_pedestrian_interval_signals = as.factor(leading_pedestrian_interval_signals),
signal_timing = as.factor(signal_timing),
speed_humps = as.factor(speed_humps),
safe_streets_for_seniors = as.factor(safe_streets_for_seniors),
street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
vz_priority_corridors = as.factor(vz_priority_corridors),
vz_priority_intersections = as.factor(vz_priority_intersections),
arterial_slow_zones = as.factor(arterial_slow_zones),
MN = as.factor(MN),
street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
vz_priority_zones = as.factor(vz_priority_zones))
Multiple Logistic Regressions
#GLM - AllInjures (minus VZ & Bike Zone)
# VZ and Bike Zones are not street attributes - just known problem areas
injury_monthly.glm <- glm(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
summary(injury_monthly.glm)
##
## Call:
## glm(formula = Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming +
## arterial_slow_zones + leading_pedestrian_interval_signals +
## signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors +
## neighborhood_slow_zones, family = "binomial", data = d.in)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6154 -0.4972 -0.4972 -0.4972 2.1272
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.02827 0.00928 -218.568
## enhanced_crossings1 0.41391 0.20585 2.011
## speed_humps1 1.01748 0.02673 38.070
## left_turn_traffic_calming1 3.83208 0.22995 16.665
## arterial_slow_zones1 0.23084 0.03411 6.766
## leading_pedestrian_interval_signals1 2.59419 0.04496 57.694
## signal_timing1 1.57612 0.02827 55.755
## safe_streets_for_seniors1 0.55982 0.02347 23.849
## street_improvement_projects_corridors1 1.45912 0.02212 65.955
## neighborhood_slow_zones1 -0.12423 0.04753 -2.614
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## enhanced_crossings1 0.04435 *
## speed_humps1 < 2e-16 ***
## left_turn_traffic_calming1 < 2e-16 ***
## arterial_slow_zones1 1.32e-11 ***
## leading_pedestrian_interval_signals1 < 2e-16 ***
## signal_timing1 < 2e-16 ***
## safe_streets_for_seniors1 < 2e-16 ***
## street_improvement_projects_corridors1 < 2e-16 ***
## neighborhood_slow_zones1 0.00896 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 139436 on 140900 degrees of freedom
## Residual deviance: 114384 on 140891 degrees of freedom
## AIC: 114404
##
## Number of Fisher Scoring iterations: 6
exp(coef((injury_monthly.glm)))
## (Intercept)
## 0.1315631
## enhanced_crossings1
## 1.5127275
## speed_humps1
## 2.7662042
## left_turn_traffic_calming1
## 46.1585062
## arterial_slow_zones1
## 1.2596567
## leading_pedestrian_interval_signals1
## 13.3857050
## signal_timing1
## 4.8361757
## safe_streets_for_seniors1
## 1.7503509
## street_improvement_projects_corridors1
## 4.3021682
## neighborhood_slow_zones1
## 0.8831721
exp(confint((injury_monthly.glm)))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1291867 0.1339726
## enhanced_crossings1 1.0047420 2.2559794
## speed_humps1 2.6247880 2.9147078
## left_turn_traffic_calming1 30.1608649 74.6309713
## arterial_slow_zones1 1.1779719 1.3465245
## leading_pedestrian_interval_signals1 12.2640092 14.6281904
## signal_timing1 4.5756508 5.1118613
## safe_streets_for_seniors1 1.6714930 1.8325959
## street_improvement_projects_corridors1 4.1195485 4.4927555
## neighborhood_slow_zones1 0.8041116 0.9688283
#build table to plot Odds Ratios
a <- data.frame(exp(coef((injury_monthly.glm))))
a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)
b <- data.frame(exp(confint((injury_monthly.glm))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)
coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"
g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
geom_point(size = 2) +
geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
ggtitle("Injuries - Coefficients & Confidence Intervals")
plot(g)
#GLM - Pedestrian Injuries
ped_injury <- glm(PedInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
summary(ped_injury)
##
## Call:
## glm(formula = PedInjurie ~ arterial_slow_zones + enhanced_crossings +
## speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals +
## street_improvement_projects_corridors + neighborhood_slow_zones,
## family = "binomial", data = d.in)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2715 -0.2758 -0.2758 -0.2758 2.5645
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -3.25040 0.01489 -218.262
## arterial_slow_zones1 0.60064 0.04099 14.653
## enhanced_crossings1 0.12559 0.31186 0.403
## speed_humps1 0.75234 0.03791 19.844
## left_turn_traffic_calming1 1.57088 0.09052 17.354
## leading_pedestrian_interval_signals1 1.63842 0.03630 45.141
## street_improvement_projects_corridors1 1.18903 0.02867 41.467
## neighborhood_slow_zones1 0.07970 0.06747 1.181
## Pr(>|z|)
## (Intercept) <2e-16 ***
## arterial_slow_zones1 <2e-16 ***
## enhanced_crossings1 0.687
## speed_humps1 <2e-16 ***
## left_turn_traffic_calming1 <2e-16 ***
## leading_pedestrian_interval_signals1 <2e-16 ***
## street_improvement_projects_corridors1 <2e-16 ***
## neighborhood_slow_zones1 0.237
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 62859 on 140900 degrees of freedom
## Residual deviance: 56468 on 140893 degrees of freedom
## AIC: 56484
##
## Number of Fisher Scoring iterations: 6
exp(coef(ped_injury))
## (Intercept)
## 0.03875886
## arterial_slow_zones1
## 1.82328425
## enhanced_crossings1
## 1.13381760
## speed_humps1
## 2.12196902
## left_turn_traffic_calming1
## 4.81087968
## leading_pedestrian_interval_signals1
## 5.14702811
## street_improvement_projects_corridors1
## 3.28389579
## neighborhood_slow_zones1
## 1.08296302
exp(confint(ped_injury))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.03763932 0.039902
## arterial_slow_zones1 1.68170429 1.974872
## enhanced_crossings1 0.58471778 2.006762
## speed_humps1 1.96915098 2.284690
## left_turn_traffic_calming1 4.02698873 5.742805
## leading_pedestrian_interval_signals1 4.79263317 5.525446
## street_improvement_projects_corridors1 3.10392500 3.473182
## neighborhood_slow_zones1 0.94709539 1.233903
#GLM - Pedestrian Injuries
#build table to plot Odds Ratios
a <- data.frame(exp(coef((ped_injury))))
a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)
b <- data.frame(exp(confint((ped_injury))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)
coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"
g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
geom_point(size = 2) +
geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
ggtitle("Pedestrian Injuries - Coefficients & Confidence Intervals")
plot(g)
#GLM - Bike Injuries
bike_injury <- glm(BikeInjuri ~ bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
summary(bike_injury)
##
## Call:
## glm(formula = BikeInjuri ~ bike_priority_districts + enhanced_crossings +
## speed_humps + left_turn_traffic_calming + arterial_slow_zones +
## signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones,
## family = "binomial", data = d.in)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4369 -0.2289 -0.1702 -0.1702 2.9127
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -4.22758 0.02403 -175.962
## bike_priority_districts1 0.59892 0.04195 14.279
## enhanced_crossings1 0.78188 0.33793 2.314
## speed_humps1 0.65842 0.05399 12.196
## left_turn_traffic_calming1 1.11028 0.11259 9.861
## arterial_slow_zones1 0.26176 0.06044 4.331
## signal_timing1 0.99781 0.04840 20.614
## street_improvement_projects_corridors1 1.19253 0.04184 28.499
## neighborhood_slow_zones1 0.44511 0.08568 5.195
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## bike_priority_districts1 < 2e-16 ***
## enhanced_crossings1 0.0207 *
## speed_humps1 < 2e-16 ***
## left_turn_traffic_calming1 < 2e-16 ***
## arterial_slow_zones1 1.49e-05 ***
## signal_timing1 < 2e-16 ***
## street_improvement_projects_corridors1 < 2e-16 ***
## neighborhood_slow_zones1 2.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33168 on 140900 degrees of freedom
## Residual deviance: 30500 on 140892 degrees of freedom
## AIC: 30518
##
## Number of Fisher Scoring iterations: 7
exp(coef(bike_injury))
## (Intercept)
## 0.01458759
## bike_priority_districts1
## 1.82015475
## enhanced_crossings1
## 2.18557277
## speed_humps1
## 1.93173839
## left_turn_traffic_calming1
## 3.03520294
## arterial_slow_zones1
## 1.29921385
## signal_timing1
## 2.71234493
## street_improvement_projects_corridors1
## 3.29540128
## neighborhood_slow_zones1
## 1.56066845
exp(confint(bike_injury))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.01391226 0.01528623
## bike_priority_districts1 1.67560740 1.97510179
## enhanced_crossings1 1.05791075 4.03440590
## speed_humps1 1.73593167 2.14515786
## left_turn_traffic_calming1 2.42455602 3.77097376
## arterial_slow_zones1 1.15280721 1.46106672
## signal_timing1 2.46583508 2.98107918
## street_improvement_projects_corridors1 3.03497519 3.57598794
## neighborhood_slow_zones1 1.31524904 1.84050982
#build table to plot Odds Ratios
a <- data.frame(exp(coef((bike_injury))))
a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)
b <- data.frame(exp(confint((bike_injury))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)
coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"
g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
geom_point(size = 2) +
geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
ggtitle("Bike Injuries - Coefficients & Confidence Intervals")
plot(g)
#GLM - MVO Injuries
mvo_injury <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
summary(mvo_injury)
##
## Call:
## glm(formula = MVOInjurie ~ arterial_slow_zones + enhanced_crossings +
## speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors +
## neighborhood_slow_zones, family = "binomial", data = d.in)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9923 -0.4408 -0.4408 -0.4408 2.3450
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.282396 0.009952 -229.350
## arterial_slow_zones1 0.692499 0.031505 21.981
## enhanced_crossings1 -0.208199 0.243587 -0.855
## speed_humps1 0.795230 0.028765 27.645
## left_turn_traffic_calming1 1.194577 0.085184 14.024
## street_improvement_projects_corridors1 1.436879 0.021219 67.718
## neighborhood_slow_zones1 -0.400998 0.056930 -7.044
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## arterial_slow_zones1 < 2e-16 ***
## enhanced_crossings1 0.393
## speed_humps1 < 2e-16 ***
## left_turn_traffic_calming1 < 2e-16 ***
## street_improvement_projects_corridors1 < 2e-16 ***
## neighborhood_slow_zones1 1.87e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 105164 on 140900 degrees of freedom
## Residual deviance: 98877 on 140894 degrees of freedom
## AIC: 98891
##
## Number of Fisher Scoring iterations: 5
exp(coef(mvo_injury))
## (Intercept)
## 0.1020394
## arterial_slow_zones1
## 1.9987036
## enhanced_crossings1
## 0.8120452
## speed_humps1
## 2.2149506
## left_turn_traffic_calming1
## 3.3021601
## street_improvement_projects_corridors1
## 4.2075444
## neighborhood_slow_zones1
## 0.6696513
exp(confint(mvo_injury))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1000634 0.1040440
## arterial_slow_zones1 1.8785837 2.1255291
## enhanced_crossings1 0.4931631 1.2868057
## speed_humps1 2.0931148 2.3429646
## left_turn_traffic_calming1 2.7934596 3.9013077
## street_improvement_projects_corridors1 4.0359406 4.3859968
## neighborhood_slow_zones1 0.5981811 0.7477761
#build table to plot Odds Ratios
a <- data.frame(exp(coef((mvo_injury))))
a <- cbind(attribute = rownames(a), a)
rownames(a) <- 1:nrow(a)
b <- data.frame(exp(confint((mvo_injury))))
## Waiting for profiling to be done...
b <- cbind(attribute = rownames(b), b)
rownames(b) <- 1:nrow(b)
coef_df <- left_join(a, b, by = "attribute")
colnames(coef_df)[2] <- "coefficient"
colnames(coef_df)[3] <- "lower_bound"
colnames(coef_df)[4] <- "upper_bound"
g <- ggplot(coef_df, aes(x = attribute, y = coefficient)) +
geom_point(size = 2) +
geom_errorbar(aes(ymax = upper_bound, ymin = lower_bound), width=0.1, color="darkred") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(data = coef_df, aes(label = round(coefficient, digits=3)), hjust = 1.2) +
ggtitle("Motor Vehicle Injuries - Coefficients & Confidence Intervals")
plot(g)
Build Predictive Model
#GENERAL INJURY
#split data 70-30
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression take 1 (all)
m.log <- glm(Injuries ~ bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors, data = d.train, family = "binomial")
#bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones +
#train logistic regression take 2 (only real safety features)
#m.log <- glm(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, data = d.train, family = "binomial")
#train logistic regression take 3 (only problem zones)
#m.log <- glm(Injuries ~ bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data = d.train, family = "binomial")
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.1,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- 1 - (tn/(tn+fp))
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.7053684
Decision Tree
#Build Decision Tree
mtree.1 <- rpart(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors, data=d.train, method="class")
rpart.plot(mtree.1)
#Plot variables of importance
d.var_imp <-data.frame(mtree.1$variable.importance)
names(d.var_imp) <- "importance"
d.var_imp$variable <-as.factor(rownames(d.var_imp))
d.var_imp <-transform(d.var_imp, variable=reorder(variable, -importance) )
filt5 <- d.var_imp %>% top_n(-5)
## Selecting by variable
g <-ggplot(filt5,aes(x=variable, y=importance)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(g)
Random Forest
#Generate RF
mrf.1 <- randomForest(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors, data=d.train, importance = TRUE, ntree=1000)
#plot variables of importance
varImpPlot(mrf.1)
#plot OOB error
mrf.1$importance
## 0 1
## enhanced_crossings 3.297610e-05 0.0000206177
## speed_humps -6.421188e-05 0.0040348501
## left_turn_traffic_calming 2.673379e-03 0.0029948626
## arterial_slow_zones -1.997459e-04 0.0044761049
## leading_pedestrian_interval_signals 2.398093e-02 0.0465890394
## signal_timing 2.736861e-02 0.0644103343
## safe_streets_for_seniors -6.891478e-04 0.0065085950
## street_improvement_projects_corridors 4.464614e-03 0.0314366349
## MeanDecreaseAccuracy
## enhanced_crossings 3.055889e-05
## speed_humps 7.360478e-04
## left_turn_traffic_calming 2.736216e-03
## arterial_slow_zones 7.138060e-04
## leading_pedestrian_interval_signals 2.839536e-02
## signal_timing 3.460155e-02
## safe_streets_for_seniors 7.150871e-04
## street_improvement_projects_corridors 9.727956e-03
## MeanDecreaseGini
## enhanced_crossings 4.404694
## speed_humps 257.116457
## left_turn_traffic_calming 191.690280
## arterial_slow_zones 142.394249
## leading_pedestrian_interval_signals 1900.152241
## signal_timing 1593.838852
## safe_streets_for_seniors 247.008181
## street_improvement_projects_corridors 1616.682815
x <- gg_error(mrf.1)
plot(x)
#predict diagnosis (tree)
d.test$Predicted_Injury <- predict(mtree.1, d.test, type="class")
#form prediction object
tree_pred <- prediction(predictions=as.numeric(d.test$Predicted_Injury), labels =as.numeric(d.test$Injuries))
#calculate AUC
auc.perf <- performance(tree_pred, measure = "auc")
tree_auc <- auc.perf@y.values
#predict diagnosis (forest)
d.test$Predicted_Injury <- predict(mrf.1, d.test, type="class")
#form prediction object (forest)
rf_pred <-prediction(predictions=as.numeric(d.test$Predicted_Injury), labels =as.numeric(d.test$Injuries))
#calculate auc
auc.perf <- performance(rf_pred, measure = "auc")
rf_auc <- auc.perf@y.values
tree_auc
## [[1]]
## [1] 0.6380118
rf_auc
## [[1]]
## [1] 0.6561914
Pedestrian Injury Predictions
#PEDESTRIAN INJURY
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression
m.log <- glm(PedInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.05,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1222029
# FNR
fn/(fn+tp)
## [1] 0.5208333
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6784819
Bike Injury
#BIKE INJURY
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression
m.log <- glm(BikeInjuri ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.03,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1069607
# FNR
fn/(fn+tp)
## [1] 0.5267002
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6831696
MVO Injury
#MVO INJURY
set.seed(123)
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)
d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ]
#train logistic regression
m.log <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)
#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")
#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.1,
1,
0)
# TP
tp <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()
# Sensitivity (or TPR)
sens <- tp/(tp+fn)
# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)
# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)
# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1192252
# FNR
fn/(fn+tp)
## [1] 0.5763889
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)
auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6521929